1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.lucene.uninverting;
19  
20  import java.io.IOException;
21  import java.util.ArrayList;
22  import java.util.Arrays;
23  import java.util.Collection;
24  import java.util.Collections;
25  import java.util.List;
26  
27  import org.apache.lucene.codecs.PostingsFormat; // javadocs
28  import org.apache.lucene.index.DocValues;
29  import org.apache.lucene.index.DocValuesType;
30  import org.apache.lucene.index.PostingsEnum;
31  import org.apache.lucene.index.FieldInfo;
32  import org.apache.lucene.index.LeafReader;
33  import org.apache.lucene.index.SortedSetDocValues;
34  import org.apache.lucene.index.Terms;
35  import org.apache.lucene.index.TermsEnum;
36  import org.apache.lucene.search.DocIdSetIterator;
37  import org.apache.lucene.util.Accountable;
38  import org.apache.lucene.util.Bits;
39  import org.apache.lucene.util.BytesRef;
40  import org.apache.lucene.util.PagedBytes;
41  import org.apache.lucene.util.StringHelper;
42  
43  /**
44   * This class enables fast access to multiple term ords for
45   * a specified field across all docIDs.
46   *
47   * Like FieldCache, it uninverts the index and holds a
48   * packed data structure in RAM to enable fast access.
49   * Unlike FieldCache, it can handle multi-valued fields,
50   * and, it does not hold the term bytes in RAM.  Rather, you
51   * must obtain a TermsEnum from the {@link #getOrdTermsEnum}
52   * method, and then seek-by-ord to get the term's bytes.
53   *
54   * While normally term ords are type long, in this API they are
55   * int as the internal representation here cannot address
56   * more than MAX_INT unique terms.  Also, typically this
57   * class is used on fields with relatively few unique terms
58   * vs the number of documents.  In addition, there is an
59   * internal limit (16 MB) on how many bytes each chunk of
60   * documents may consume.  If you trip this limit you'll hit
61   * an IllegalStateException.
62   *
63   * Deleted documents are skipped during uninversion, and if
64   * you look them up you'll get 0 ords.
65   *
66   * The returned per-document ords do not retain their
67   * original order in the document.  Instead they are returned
68   * in sorted (by ord, ie term's BytesRef comparator) order.  They
69   * are also de-dup'd (ie if doc has same term more than once
70   * in this field, you'll only get that ord back once).
71   *
72   * This class
73   * will create its own term index internally, allowing to
74   * create a wrapped TermsEnum that can handle ord.  The
75   * {@link #getOrdTermsEnum} method then provides this
76   * wrapped enum.
77   *
78   * The RAM consumption of this class can be high!
79   *
80   * @lucene.experimental
81   */
82  
83  /*
84   * Final form of the un-inverted field:
85   *   Each document points to a list of term numbers that are contained in that document.
86   *
87   *   Term numbers are in sorted order, and are encoded as variable-length deltas from the
88   *   previous term number.  Real term numbers start at 2 since 0 and 1 are reserved.  A
89   *   term number of 0 signals the end of the termNumber list.
90   *
91   *   There is a single int[maxDoc()] which either contains a pointer into a byte[] for
92   *   the termNumber lists, or directly contains the termNumber list if it fits in the 4
93   *   bytes of an integer.  If the first byte in the integer is 1, the next 3 bytes
94   *   are a pointer into a byte[] where the termNumber list starts.
95   *
96   *   There are actually 256 byte arrays, to compensate for the fact that the pointers
97   *   into the byte arrays are only 3 bytes long.  The correct byte array for a document
98   *   is a function of its id.
99   *
100  *   To save space and speed up faceting, any term that matches enough documents will
101  *   not be un-inverted... it will be skipped while building the un-inverted field structure,
102  *   and will use a set intersection method during faceting.
103  *
104  *   To further save memory, the terms (the actual string values) are not all stored in
105  *   memory, but a TermIndex is used to convert term numbers to term values only
106  *   for the terms needed after faceting has completed.  Only every 128th term value
107  *   is stored, along with its corresponding term number, and this is used as an
108  *   index to find the closest term and iterate until the desired number is hit (very
109  *   much like Lucene's own internal term index).
110  *
111  */
112 
113 public class DocTermOrds implements Accountable {
114 
  // Term ords are shifted by this, internally, to reserve
  // values 0 (end term) and 1 (index is a pointer into byte array)
  private final static int TNUM_OFFSET = 2;

  /** Every 128th term is indexed, by default. */
  public final static int DEFAULT_INDEX_INTERVAL_BITS = 7; // decrease to a low number like 2 for testing

  // log2 of the indexing interval (set once in the ctor).
  private int indexIntervalBits;
  // Low-bit mask derived from indexIntervalBits; (termNum & mask) == 0
  // selects the terms that get copied into indexedTermsArray.
  private int indexIntervalMask;
  // 1 << indexIntervalBits: distance in ords between indexed terms.
  private int indexInterval;

  /** Don't uninvert terms that exceed this count. */
  protected final int maxTermDocFreq;

  /** Field we are uninverting. */
  protected final String field;

  /** Number of terms in the field. */
  protected int numTermsInField;

  /** Total number of references to term numbers. */
  protected long termInstances;
  // Cached result of ramBytesUsed(); 0 means "not computed yet".
  private long memsz;

  /** Total time to uninvert the field. */
  protected int total_time;

  /** Time for phase1 of the uninvert process. */
  protected int phase1_time;

  /** Holds the per-document ords or a pointer to the ords. */
  protected int[] index;

  /** Holds term ords for documents. */
  // 256 arrays: a doc's array is selected by (docID >>> 16) & 0xff because
  // only 24 bits are available for the offset stored in index[].
  protected byte[][] tnums = new byte[256][];

  /** Total bytes (sum of term lengths) for all indexed terms.*/
  protected long sizeOfIndexedStrings;

  /** Holds the indexed (by default every 128th) terms. */
  protected BytesRef[] indexedTermsArray = new BytesRef[0];

  /** If non-null, only terms matching this prefix were
   *  indexed. */
  protected BytesRef prefix;

  /** Ordinal of the first term in the field, or 0 if the
   *  {@link PostingsFormat} does not implement {@link
   *  TermsEnum#ord}. */
  protected int ordBase;

  /** Used while uninverting. */
  // Reused across terms to avoid reallocating the postings enum.
  protected PostingsEnum postingsEnum;
168 
169   /** Returns total bytes used. */
170   public long ramBytesUsed() {
171     // can cache the mem size since it shouldn't change
172     if (memsz!=0) return memsz;
173     long sz = 8*8 + 32; // local fields
174     if (index != null) sz += index.length * 4;
175     if (tnums!=null) {
176       for (byte[] arr : tnums)
177         if (arr != null) sz += arr.length;
178     }
179     memsz = sz;
180     return sz;
181   }
182 
  @Override
  public Collection<Accountable> getChildResources() {
    // All memory is reported as a single number by ramBytesUsed();
    // there are no separately-accountable child structures.
    return Collections.emptyList();
  }
187 
  /** Inverts all terms
   *
   * @param reader leaf reader whose field will be uninverted
   * @param liveDocs live-document bits, passed through to uninvert
   * @param field name of the field to uninvert
   * @throws IOException if reading the index fails
   */
  public DocTermOrds(LeafReader reader, Bits liveDocs, String field) throws IOException {
    this(reader, liveDocs, field, null, Integer.MAX_VALUE);
  }
192   
193   // TODO: instead of all these ctors and options, take termsenum!
194 
  /** Inverts only terms starting w/ prefix
   *
   * @param termPrefix if non-null, only terms with this prefix are uninverted
   */
  public DocTermOrds(LeafReader reader, Bits liveDocs, String field, BytesRef termPrefix) throws IOException {
    this(reader, liveDocs, field, termPrefix, Integer.MAX_VALUE);
  }
199 
  /** Inverts only terms starting w/ prefix, and only terms
   *  whose docFreq (not taking deletions into account) is
   *  &lt;=  maxTermDocFreq
   *
   * @param maxTermDocFreq terms with a higher docFreq are skipped during uninversion
   */
  public DocTermOrds(LeafReader reader, Bits liveDocs, String field, BytesRef termPrefix, int maxTermDocFreq) throws IOException {
    this(reader, liveDocs, field, termPrefix, maxTermDocFreq, DEFAULT_INDEX_INTERVAL_BITS);
  }
206 
  /** Inverts only terms starting w/ prefix, and only terms
   *  whose docFreq (not taking deletions into account) is
   *  &lt;=  maxTermDocFreq, with a custom indexing interval
   *  (default is every 128th term).
   *
   * @param indexIntervalBits log2 of the term indexing interval; every
   *        (1 &lt;&lt; indexIntervalBits)-th term is kept in RAM for seeking
   */
  public DocTermOrds(LeafReader reader, Bits liveDocs, String field, BytesRef termPrefix, int maxTermDocFreq, int indexIntervalBits) throws IOException {
    this(field, maxTermDocFreq, indexIntervalBits);
    uninvert(reader, liveDocs, termPrefix);
  }
215 
  /** Subclass inits w/ this, but be sure you then call
   *  uninvert, only once */
  protected DocTermOrds(String field, int maxTermDocFreq, int indexIntervalBits) {
    //System.out.println("DTO init field=" + field + " maxTDFreq=" + maxTermDocFreq);
    this.field = field;
    this.maxTermDocFreq = maxTermDocFreq;
    this.indexIntervalBits = indexIntervalBits;
    // Mask of the low indexIntervalBits bits; written as an unsigned shift
    // rather than (1 << bits) - 1 to match the historical behavior at the edges.
    indexIntervalMask = 0xffffffff >>> (32-indexIntervalBits);
    indexInterval = 1 << indexIntervalBits;
  }
226 
227   /** 
228    * Returns a TermsEnum that implements ord, or null if no terms in field.
229    * <p>
230    *  we build a "private" terms
231    *  index internally (WARNING: consumes RAM) and use that
232    *  index to implement ord.  This also enables ord on top
233    *  of a composite reader.  The returned TermsEnum is
234    *  unpositioned.  This returns null if there are no terms.
235    * </p>
236    *  <p><b>NOTE</b>: you must pass the same reader that was
237    *  used when creating this class 
238    */
239   public TermsEnum getOrdTermsEnum(LeafReader reader) throws IOException {
240     // NOTE: see LUCENE-6529 before attempting to optimize this method to
241     // return a TermsEnum directly from the reader if it already supports ord().
242 
243     assert null != indexedTermsArray;
244     
245     if (0 == indexedTermsArray.length) {
246       return null;
247     } else {
248       return new OrdWrappedTermsEnum(reader);
249     }
250   }
251 
252   /**
253    * Returns the number of terms in this field
254    */
255   public int numTerms() {
256     return numTermsInField;
257   }
258 
259   /**
260    * Returns {@code true} if no terms were indexed.
261    */
262   public boolean isEmpty() {
263     return index == null;
264   }
265 
  /** Subclass can override this */
  // Hook invoked once per term during uninversion, before the term's
  // postings are processed; termNum is the 0-based ord assigned to the term.
  protected void visitTerm(TermsEnum te, int termNum) throws IOException {
  }
269 
  /** Invoked during {@link #uninvert(org.apache.lucene.index.LeafReader,Bits,BytesRef)}
   *  to record the document frequency for each uninverted
   *  term. */
  // Default implementation is a no-op; subclasses may record the df.
  protected void setActualDocFreq(int termNum, int df) throws IOException {
  }
275 
  /** Call this only once (if you subclass!) */
  protected void uninvert(final LeafReader reader, Bits liveDocs, final BytesRef termPrefix) throws IOException {
    // Refuse to uninvert a field that already has doc values: callers should
    // use the doc values directly instead.
    final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
    if (info != null && info.getDocValuesType() != DocValuesType.NONE) {
      throw new IllegalStateException("Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
    }
    //System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix);
    final long startTime = System.currentTimeMillis();
    prefix = termPrefix == null ? null : BytesRef.deepCopyOf(termPrefix);

    final int maxDoc = reader.maxDoc();
    final int[] index = new int[maxDoc];       // immediate term numbers, or the index into the byte[] representing the last number
    final int[] lastTerm = new int[maxDoc];    // last term we saw for this document
    final byte[][] bytes = new byte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts)

    final Terms terms = reader.terms(field);
    if (terms == null) {
      // No terms
      return;
    }

    final TermsEnum te = terms.iterator();
    final BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef();
    //System.out.println("seekStart=" + seekStart.utf8ToString());
    if (te.seekCeil(seekStart) == TermsEnum.SeekStatus.END) {
      // No terms match
      return;
    }

    // For our "term index wrapper"
    final List<BytesRef> indexedTerms = new ArrayList<>();
    final PagedBytes indexedTermsBytes = new PagedBytes(15);

    // we need a minimum of 9 bytes, but round up to 12 since the space would
    // be wasted with most allocators anyway.
    byte[] tempArr = new byte[12];

    //
    // enumerate all terms, and build an intermediate form of the un-inverted field.
    //
    // During this intermediate form, every document has a (potential) byte[]
    // and the int[maxDoc()] array either contains the termNumber list directly
    // or the *end* offset of the termNumber list in its byte array (for faster
    // appending and faster creation of the final form).
    //
    // idea... if things are too large while building, we could do a range of docs
    // at a time (but it would be a fair amount slower to build)
    // could also do ranges in parallel to take advantage of multiple CPUs

    // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
    // values.  This requires going over the field first to find the most
    // frequent terms ahead of time.

    int termNum = 0;
    postingsEnum = null;

    // Loop begins with te positioned to first term (we call
    // seek above):
    for (;;) {
      final BytesRef t = te.term();
      if (t == null || (termPrefix != null && !StringHelper.startsWith(t, termPrefix))) {
        break;
      }
      //System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum);

      visitTerm(te, termNum);

      if ((termNum & indexIntervalMask) == 0) {
        // Index this term
        sizeOfIndexedStrings += t.length;
        BytesRef indexedTerm = new BytesRef();
        indexedTermsBytes.copy(t, indexedTerm);
        // TODO: really should 1) strip off useless suffix,
        // and 2) use FST not array/PagedBytes
        indexedTerms.add(indexedTerm);
      }

      final int df = te.docFreq();
      if (df <= maxTermDocFreq) {

        // NOTE(review): liveDocs is accepted by this method but not consulted
        // here; postings are iterated without deletion filtering — confirm
        // deleted docs are handled by callers of the resulting ords.
        postingsEnum = te.postings(postingsEnum, PostingsEnum.NONE);

        // dF, but takes deletions into account
        int actualDF = 0;

        for (;;) {
          int doc = postingsEnum.nextDoc();
          if (doc == DocIdSetIterator.NO_MORE_DOCS) {
            break;
          }
          //System.out.println("  chunk=" + chunk + " docs");

          actualDF ++;
          termInstances++;
          
          //System.out.println("    docID=" + doc);
          // add TNUM_OFFSET to the term number to make room for special reserved values:
          // 0 (end term) and 1 (index into byte array follows)
          int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
          lastTerm[doc] = termNum;
          int val = index[doc];

          if ((val & 0xff)==1) {
            // index into byte array (actually the end of
            // the doc-specific byte[] when building)
            int pos = val >>> 8;
            int ilen = vIntSize(delta);
            byte[] arr = bytes[doc];
            int newend = pos+ilen;
            if (newend > arr.length) {
              // We avoid a doubling strategy to lower memory usage.
              // this faceting method isn't for docs with many terms.
              // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary.
              // TODO: figure out what array lengths we can round up to w/o actually using more memory
              // (how much space does a byte[] take up?  Is data preceded by a 32 bit length only?
              // It should be safe to round up to the nearest 32 bits in any case.
              int newLen = (newend + 3) & 0xfffffffc;  // 4 byte alignment
              byte[] newarr = new byte[newLen];
              System.arraycopy(arr, 0, newarr, 0, pos);
              arr = newarr;
              bytes[doc] = newarr;
            }
            pos = writeInt(delta, arr, pos);
            index[doc] = (pos<<8) | 1;  // update pointer to end index in byte[]
          } else {
            // OK, this int has data in it... find the end (a zero starting byte - not
            // part of another number, hence not following a byte with the high bit set).
            int ipos;
            if (val==0) {
              ipos=0;
            } else if ((val & 0x0000ff80)==0) {
              ipos=1;
            } else if ((val & 0x00ff8000)==0) {
              ipos=2;
            } else if ((val & 0xff800000)==0) {
              ipos=3;
            } else {
              ipos=4;
            }

            //System.out.println("      ipos=" + ipos);

            // Tentatively write the delta after the existing inline bytes;
            // if it still fits in 4 bytes we keep the inline form.
            int endPos = writeInt(delta, tempArr, ipos);
            //System.out.println("      endpos=" + endPos);
            if (endPos <= 4) {
              //System.out.println("      fits!");
              // value will fit in the integer... move bytes back
              for (int j=ipos; j<endPos; j++) {
                val |= (tempArr[j] & 0xff) << (j<<3);
              }
              index[doc] = val;
            } else {
              // value won't fit... move integer into byte[]
              for (int j=0; j<ipos; j++) {
                tempArr[j] = (byte)val;
                val >>>=8;
              }
              // point at the end index in the byte[]
              index[doc] = (endPos<<8) | 1;
              bytes[doc] = tempArr;
              tempArr = new byte[12];
            }
          }
        }
        setActualDocFreq(termNum, actualDF);
      }

      termNum++;
      if (te.next() == null) {
        break;
      }
    }

    numTermsInField = termNum;

    long midPoint = System.currentTimeMillis();

    if (termInstances == 0) {
      // we didn't invert anything
      // lower memory consumption.
      tnums = null;
    } else {

      this.index = index;

      //
      // transform intermediate form into the final form, building a single byte[]
      // at a time, and releasing the intermediate byte[]s as we go to avoid
      // increasing the memory footprint.
      //

      for (int pass = 0; pass<256; pass++) {
        byte[] target = tnums[pass];
        int pos=0;  // end in target;
        if (target != null) {
          pos = target.length;
        } else {
          target = new byte[4096];
        }

        // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
        // where pp is the pass (which array we are building), and xx is all values.
        // each pass shares the same byte[] for termNumber lists.
        for (int docbase = pass<<16; docbase<maxDoc; docbase+=(1<<24)) {
          int lim = Math.min(docbase + (1<<16), maxDoc);
          for (int doc=docbase; doc<lim; doc++) {
            //System.out.println("  pass=" + pass + " process docID=" + doc);
            int val = index[doc];
            if ((val&0xff) == 1) {
              int len = val >>> 8;
              //System.out.println("    ptr pos=" + pos);
              index[doc] = (pos<<8)|1; // change index to point to start of array
              if ((pos & 0xff000000) != 0) {
                // we only have 24 bits for the array index
                throw new IllegalStateException("Too many values for UnInvertedField faceting on field "+field);
              }
              byte[] arr = bytes[doc];
              /*
              for(byte b : arr) {
                //System.out.println("      b=" + Integer.toHexString((int) b));
              }
              */
              bytes[doc] = null;        // IMPORTANT: allow GC to avoid OOM
              if (target.length <= pos + len) {
                int newlen = target.length;
                /*** we don't have to worry about the array getting too large
                 * since the "pos" param will overflow first (only 24 bits available)
                if ((newlen<<1) <= 0) {
                  // overflow...
                  newlen = Integer.MAX_VALUE;
                  if (newlen <= pos + len) {
                    throw new SolrException(400,"Too many terms to uninvert field!");
                  }
                } else {
                  while (newlen <= pos + len) newlen<<=1;  // doubling strategy
                }
                ****/
                while (newlen <= pos + len) newlen<<=1;  // doubling strategy
                byte[] newtarget = new byte[newlen];
                System.arraycopy(target, 0, newtarget, 0, pos);
                target = newtarget;
              }
              System.arraycopy(arr, 0, target, pos, len);
              pos += len + 1;  // skip single byte at end and leave it 0 for terminator
            }
          }
        }

        // shrink array
        if (pos < target.length) {
          byte[] newtarget = new byte[pos];
          System.arraycopy(target, 0, newtarget, 0, pos);
          target = newtarget;
        }
        
        tnums[pass] = target;

        if ((pass << 16) > maxDoc)
          break;
      }

    }
    indexedTermsArray = indexedTerms.toArray(new BytesRef[indexedTerms.size()]);

    long endTime = System.currentTimeMillis();

    total_time = (int)(endTime-startTime);
    phase1_time = (int)(midPoint-startTime);
  }
545 
546   /** Number of bytes to represent an unsigned int as a vint. */
547   private static int vIntSize(int x) {
548     if ((x & (0xffffffff << (7*1))) == 0 ) {
549       return 1;
550     }
551     if ((x & (0xffffffff << (7*2))) == 0 ) {
552       return 2;
553     }
554     if ((x & (0xffffffff << (7*3))) == 0 ) {
555       return 3;
556     }
557     if ((x & (0xffffffff << (7*4))) == 0 ) {
558       return 4;
559     }
560     return 5;
561   }
562 
563   // todo: if we know the size of the vInt already, we could do
564   // a single switch on the size
565   private static int writeInt(int x, byte[] arr, int pos) {
566     int a;
567     a = (x >>> (7*4));
568     if (a != 0) {
569       arr[pos++] = (byte)(a | 0x80);
570     }
571     a = (x >>> (7*3));
572     if (a != 0) {
573       arr[pos++] = (byte)(a | 0x80);
574     }
575     a = (x >>> (7*2));
576     if (a != 0) {
577       arr[pos++] = (byte)(a | 0x80);
578     }
579     a = (x >>> (7*1));
580     if (a != 0) {
581       arr[pos++] = (byte)(a | 0x80);
582     }
583     arr[pos++] = (byte)(x & 0x7f);
584     return pos;
585   }
586 
587   /** 
588    * "wrap" our own terms index around the original IndexReader. 
589    * Only valid if there are terms for this field rom the original reader
590    */
591   private final class OrdWrappedTermsEnum extends TermsEnum {
592     private final TermsEnum termsEnum;
593     private BytesRef term;
594     private long ord = -indexInterval-1;          // force "real" seek
595     
596     public OrdWrappedTermsEnum(LeafReader reader) throws IOException {
597       assert indexedTermsArray != null;
598       assert 0 != indexedTermsArray.length;
599       termsEnum = reader.fields().terms(field).iterator();
600     }
601 
602     @Override    
603     public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
604       return termsEnum.postings(reuse, flags);
605     }
606 
607     @Override
608     public BytesRef term() {
609       return term;
610     }
611 
612     @Override
613     public BytesRef next() throws IOException {
614       if (++ord < 0) {
615         ord = 0;
616       }
617       if (termsEnum.next() == null) {
618         term = null;
619         return null;
620       }
621       return setTerm();  // this is extra work if we know we are in bounds...
622     }
623 
624     @Override
625     public int docFreq() throws IOException {
626       return termsEnum.docFreq();
627     }
628 
629     @Override
630     public long totalTermFreq() throws IOException {
631       return termsEnum.totalTermFreq();
632     }
633 
634     @Override
635     public long ord() {
636       return ordBase + ord;
637     }
638 
639     @Override
640     public SeekStatus seekCeil(BytesRef target) throws IOException {
641 
642       // already here
643       if (term != null && term.equals(target)) {
644         return SeekStatus.FOUND;
645       }
646 
647       int startIdx = Arrays.binarySearch(indexedTermsArray, target);
648 
649       if (startIdx >= 0) {
650         // we hit the term exactly... lucky us!
651         TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(target);
652         assert seekStatus == TermsEnum.SeekStatus.FOUND;
653         ord = startIdx << indexIntervalBits;
654         setTerm();
655         assert term != null;
656         return SeekStatus.FOUND;
657       }
658 
659       // we didn't hit the term exactly
660       startIdx = -startIdx-1;
661     
662       if (startIdx == 0) {
663         // our target occurs *before* the first term
664         TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(target);
665         assert seekStatus == TermsEnum.SeekStatus.NOT_FOUND;
666         ord = 0;
667         setTerm();
668         assert term != null;
669         return SeekStatus.NOT_FOUND;
670       }
671 
672       // back up to the start of the block
673       startIdx--;
674 
675       if ((ord >> indexIntervalBits) == startIdx && term != null && term.compareTo(target) <= 0) {
676         // we are already in the right block and the current term is before the term we want,
677         // so we don't need to seek.
678       } else {
679         // seek to the right block
680         TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(indexedTermsArray[startIdx]);
681         assert seekStatus == TermsEnum.SeekStatus.FOUND;
682         ord = startIdx << indexIntervalBits;
683         setTerm();
684         assert term != null;  // should be non-null since it's in the index
685       }
686 
687       while (term != null && term.compareTo(target) < 0) {
688         next();
689       }
690 
691       if (term == null) {
692         return SeekStatus.END;
693       } else if (term.compareTo(target) == 0) {
694         return SeekStatus.FOUND;
695       } else {
696         return SeekStatus.NOT_FOUND;
697       }
698     }
699 
700     @Override
701     public void seekExact(long targetOrd) throws IOException {
702       int delta = (int) (targetOrd - ordBase - ord);
703       //System.out.println("  seek(ord) targetOrd=" + targetOrd + " delta=" + delta + " ord=" + ord + " ii=" + indexInterval);
704       if (delta < 0 || delta > indexInterval) {
705         final int idx = (int) (targetOrd >>> indexIntervalBits);
706         final BytesRef base = indexedTermsArray[idx];
707         //System.out.println("  do seek term=" + base.utf8ToString());
708         ord = idx << indexIntervalBits;
709         delta = (int) (targetOrd - ord);
710         final TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(base);
711         assert seekStatus == TermsEnum.SeekStatus.FOUND;
712       } else {
713         //System.out.println("seek w/in block");
714       }
715 
716       while (--delta >= 0) {
717         BytesRef br = termsEnum.next();
718         if (br == null) {
719           assert false;
720           return;
721         }
722         ord++;
723       }
724 
725       setTerm();
726       assert term != null;
727     }
728 
729     private BytesRef setTerm() throws IOException {
730       term = termsEnum.term();
731       //System.out.println("  setTerm() term=" + term.utf8ToString() + " vs prefix=" + (prefix == null ? "null" : prefix.utf8ToString()));
732       if (prefix != null && !StringHelper.startsWith(term, prefix)) {
733         term = null;
734       }
735       return term;
736     }
737   }
738 
  /** Returns the term ({@link BytesRef}) corresponding to
   *  the provided ordinal.
   *
   * @param termsEnum an ord-capable enum, e.g. from {@link #getOrdTermsEnum}
   * @param ord the ordinal to resolve
   */
  public BytesRef lookupTerm(TermsEnum termsEnum, int ord) throws IOException {
    termsEnum.seekExact(ord);
    return termsEnum.term();
  }
745   
746   /** Returns a SortedSetDocValues view of this instance */
747   public SortedSetDocValues iterator(LeafReader reader) throws IOException {
748     if (isEmpty()) {
749       return DocValues.emptySortedSet();
750     } else {
751       return new Iterator(reader);
752     }
753   }
754   
  // SortedSetDocValues view over the uninverted ords; decodes the packed
  // per-document term-number lists built by uninvert().
  private class Iterator extends SortedSetDocValues {
    final LeafReader reader;
    final TermsEnum te;  // used internally for lookupOrd() and lookupTerm()
    // currently we read 5 at a time (using the logic of the old iterator)
    final int buffer[] = new int[5];
    int bufferUpto;
    int bufferLength;
    
    // Running term number for delta decoding (reset per document).
    private int tnum;
    // Either the packed inline int (arr == null) or an offset into arr.
    private int upto;
    // Byte array holding this doc's vInt-encoded deltas, or null if inline.
    private byte[] arr;
    
    Iterator(LeafReader reader) throws IOException {
      this.reader = reader;
      this.te = termsEnum();
    }
    
    @Override
    public long nextOrd() {
      while (bufferUpto == bufferLength) {
        // A partially-filled buffer means the previous read() hit the
        // terminator, so there are no more ords for this document.
        if (bufferLength < buffer.length) {
          return NO_MORE_ORDS;
        } else {
          bufferLength = read(buffer);
          bufferUpto = 0;
        }
      }
      return buffer[bufferUpto++];
    }
    
    /** Buffer must be at least 5 ints long.  Returns number
     *  of term ords placed into buffer; if this count is
     *  less than buffer.length then that is the end. */
    int read(int[] buffer) {
      int bufferUpto = 0;
      if (arr == null) {
        // code is inlined into upto
        //System.out.println("inlined");
        int code = upto;
        int delta = 0;
        // Decode vInt deltas packed least-significant-byte-first into the int;
        // a byte with the high bit clear ends one delta, a zero delta ends the list.
        for (;;) {
          delta = (delta << 7) | (code & 0x7f);
          if ((code & 0x80)==0) {
            if (delta==0) break;
            tnum += delta - TNUM_OFFSET;
            buffer[bufferUpto++] = ordBase+tnum;
            //System.out.println("  tnum=" + tnum);
            delta = 0;
          }
          code >>>= 8;
        }
      } else {
        // code is a pointer
        for(;;) {
          int delta = 0;
          for(;;) {
            byte b = arr[upto++];
            delta = (delta << 7) | (b & 0x7f);
            //System.out.println("    cycle: upto=" + upto + " delta=" + delta + " b=" + b);
            if ((b & 0x80) == 0) break;
          }
          //System.out.println("  delta=" + delta);
          if (delta == 0) break;
          tnum += delta - TNUM_OFFSET;
          //System.out.println("  tnum=" + tnum);
          buffer[bufferUpto++] = ordBase+tnum;
          if (bufferUpto == buffer.length) {
            break;
          }
        }
      }

      return bufferUpto;
    }

    @Override
    public void setDocument(int docID) {
      tnum = 0;
      final int code = index[docID];
      if ((code & 0xff)==1) {
        // a pointer
        upto = code>>>8;
        //System.out.println("    pointer!  upto=" + upto);
        // The 256 tnums arrays are selected by bits 16-23 of the docID.
        int whichArray = (docID >>> 16) & 0xff;
        arr = tnums[whichArray];
      } else {
        //System.out.println("    inline!");
        arr = null;
        upto = code;
      }
      bufferUpto = 0;
      bufferLength = read(buffer);
    }

    @Override
    public BytesRef lookupOrd(long ord) {
      try {
        return DocTermOrds.this.lookupTerm(te, (int) ord);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }

    @Override
    public long getValueCount() {
      return numTerms();
    }

    @Override
    public long lookupTerm(BytesRef key) {
      try {
        switch (te.seekCeil(key)) {
          case FOUND:           
            assert te.ord() >= 0;
            return te.ord();
          case NOT_FOUND:
            assert te.ord() >= 0;
            return -te.ord()-1;
          default: /* END */
            return -numTerms()-1;
        }
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
    
    @Override
    public TermsEnum termsEnum() {    
      try {
        return getOrdTermsEnum(reader);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
  }
890 }